The Thera bank recently saw a steep decline in the number of users of their credit card, credit cards are a good source of income for banks because of different kinds of fees charged by the banks like annual fees, balance transfer fees, and cash advance fees, late payment fees, foreign transaction fees, and others. Some fees are charged to every user irrespective of usage, while others are charged under specified circumstances.
Customers leaving the credit card service would lead the bank to a loss, so the bank wants to analyze customer data to identify the customers who will leave their credit card services, and the reasons for leaving — so that the bank can improve upon those areas.
To come up with a classification model that will help the bank improve its services and minimize the number of customers who renounce their credit cards.
# To help with reading and manipulating data
import pandas as pd
import numpy as np
# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# To be used for missing value imputation
from sklearn.impute import SimpleImputer
# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier,
)
from xgboost import XGBClassifier
# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
)
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)
# To impute missing values
from sklearn.impute import KNNImputer
# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# To supress warnings
import warnings
warnings.filterwarnings("ignore")
# This will help in making the Python code more structured automatically (good coding practice)
%load_ext nb_black
The nb_black extension is already loaded. To reload it, use: %reload_ext nb_black
bank = pd.read_csv("BankChurners.csv")
# Checking the number of rows and columns in the data
bank.shape
(10127, 21)
Observation
# Copying the data to another variable to avoid any changes to the original data
data = bank.copy()
data.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 |
data.tail()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | 2 | 3 | 4003.000 | 1851 | 2152.000 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | 4 | 2 | 3 | 4277.000 | 2186 | 2091.000 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | 3 | 4 | 5409.000 | 0 | 5409.000 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | 4 | 3 | 3 | 5281.000 | 0 | 5281.000 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | 2 | 4 | 10388.000 | 1961 | 8427.000 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
Observation
data.duplicated().sum()
0
Observation
# let's check for missing values in the data
round(data.isnull().sum() / data.isnull().count() * 100, 2)
CLIENTNUM 0.000 Attrition_Flag 0.000 Customer_Age 0.000 Gender 0.000 Dependent_count 0.000 Education_Level 15.000 Marital_Status 7.400 Income_Category 0.000 Card_Category 0.000 Months_on_book 0.000 Total_Relationship_Count 0.000 Months_Inactive_12_mon 0.000 Contacts_Count_12_mon 0.000 Credit_Limit 0.000 Total_Revolving_Bal 0.000 Avg_Open_To_Buy 0.000 Total_Amt_Chng_Q4_Q1 0.000 Total_Trans_Amt 0.000 Total_Trans_Ct 0.000 Total_Ct_Chng_Q4_Q1 0.000 Avg_Utilization_Ratio 0.000 dtype: float64
Observation
# let's view the statistical summary of the numerical columns in the data
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.000 | 739177606.334 | 36903783.450 | 708082083.000 | 713036770.500 | 717926358.000 | 773143533.000 | 828343083.000 |
| Customer_Age | 10127.000 | 46.326 | 8.017 | 26.000 | 41.000 | 46.000 | 52.000 | 73.000 |
| Dependent_count | 10127.000 | 2.346 | 1.299 | 0.000 | 1.000 | 2.000 | 3.000 | 5.000 |
| Months_on_book | 10127.000 | 35.928 | 7.986 | 13.000 | 31.000 | 36.000 | 40.000 | 56.000 |
| Total_Relationship_Count | 10127.000 | 3.813 | 1.554 | 1.000 | 3.000 | 4.000 | 5.000 | 6.000 |
| Months_Inactive_12_mon | 10127.000 | 2.341 | 1.011 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 10127.000 | 2.455 | 1.106 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Credit_Limit | 10127.000 | 8631.954 | 9088.777 | 1438.300 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.000 | 1162.814 | 814.987 | 0.000 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.000 | 7469.140 | 9090.685 | 3.000 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.000 | 0.760 | 0.219 | 0.000 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.000 | 4404.086 | 3397.129 | 510.000 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.000 | 64.859 | 23.473 | 10.000 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.000 | 0.712 | 0.238 | 0.000 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.000 | 0.275 | 0.276 | 0.000 | 0.023 | 0.176 | 0.503 | 0.999 |
Observation
data.describe(exclude=np.number).T
| count | unique | top | freq | |
|---|---|---|---|---|
| Attrition_Flag | 10127 | 2 | Existing Customer | 8500 |
| Gender | 10127 | 2 | F | 5358 |
| Education_Level | 8608 | 6 | Graduate | 3128 |
| Marital_Status | 9378 | 3 | Married | 4687 |
| Income_Category | 10127 | 6 | Less than $40K | 3561 |
| Card_Category | 10127 | 4 | Blue | 9436 |
Observation
data["Attrition_Flag"].value_counts(1)
Existing Customer 0.839 Attrited Customer 0.161 Name: Attrition_Flag, dtype: float64
Observation
# Making a list of all categorical variables (hand-picked object-dtype columns)
cat_col = [
    "Attrition_Flag",
    "Gender",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
]
# Printing the count of each unique value in each categorical column,
# separated by a dashed rule for readability
for column in cat_col:
    print(data[column].value_counts())
    print("-" * 40)
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 ---------------------------------------- F 5358 M 4769 Name: Gender, dtype: int64 ---------------------------------------- Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 ---------------------------------------- Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 ---------------------------------------- Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 ---------------------------------------- Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 ----------------------------------------
Observation
# Dropping CLIENTNUM as it is an id column, which has no bearing on the analysis
data.drop(columns=["CLIENTNUM"], inplace=True)
# 'abc' is a placeholder for unknown income in this dataset; treat it as missing
data["Income_Category"] = data["Income_Category"].replace('abc',np.nan)
data["Income_Category"].value_counts()
Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 $120K + 727 Name: Income_Category, dtype: int64
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined on a shared x-axis.

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None, i.e. let seaborn choose)
    """
    # Two stacked subplots sharing the x-axis: a slim boxplot on top (25% of
    # the height), the histogram underneath (75%).
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans=True marks the mean of the column with a star
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )
    # Plain if/else instead of the original conditional expression that was
    # evaluated purely for its side effect (both return values were discarded).
    if bins:
        sns.histplot(
            data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
        )
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    # Mark the mean (green dashed) and median (black solid) on the histogram
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with the count or percentage annotated above each bar.

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of counts (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # number of rows, denominator for percentages
    count = data[feature].nunique()
    # Scale the figure width with the number of bars to be shown
    if n is None:
        plt.figure(figsize=(count + 2, 6))
    else:
        plt.figure(figsize=(n + 2, 6))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # keep only the n most frequent levels, then sort them for display
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        # idiomatic truthiness test instead of the original `if perc == True:`
        if perc:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # place the label just above the bar
    plt.show()  # show the plot
labeled_barplot(data, "Attrition_Flag", perc=True)
Observation
labeled_barplot(data, "Gender", perc=True)
Observation
labeled_barplot(data, "Education_Level", perc=True)
Observation
labeled_barplot(data, "Marital_Status", perc=True)
Observation
labeled_barplot(data, "Income_Category", perc=True)
Observation
labeled_barplot(data, "Card_Category", perc=True)
Observation
histogram_boxplot(data, "Customer_Age")
Observation
labeled_barplot(data, "Dependent_count", perc=True)
Observation
histogram_boxplot(data, "Credit_Limit")
Observation
histogram_boxplot(data, "Total_Revolving_Bal")
Observation
histogram_boxplot(data, "Total_Trans_Amt")
Observation
histogram_boxplot(data, "Total_Trans_Ct")
Observation
# Finding the top 10 values of Total_Transaction Count
data.Total_Trans_Ct.nlargest(10)
9324 139 9586 138 9213 134 9629 132 9261 131 9269 131 9339 131 9728 131 9841 131 10085 131 Name: Total_Trans_Ct, dtype: int64
# Finding the rows of Total Transaction Count greater than 131
data[data["Total_Trans_Ct"] > 131]
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9213 | Existing Customer | 32 | M | 1 | Uneducated | Single | $60K - $80K | Silver | 36 | 2 | 3 | 1 | 33711.000 | 1437 | 32274.000 | 0.942 | 14880 | 134 | 0.654 | 0.043 |
| 9324 | Existing Customer | 41 | M | 3 | NaN | Married | $120K + | Blue | 33 | 2 | 4 | 3 | 34516.000 | 638 | 33878.000 | 0.724 | 13085 | 139 | 0.675 | 0.018 |
| 9586 | Existing Customer | 56 | F | 1 | High School | Married | NaN | Blue | 49 | 1 | 2 | 1 | 17542.000 | 2517 | 15025.000 | 0.800 | 13939 | 138 | 0.792 | 0.143 |
| 9629 | Existing Customer | 42 | M | 2 | Graduate | Single | $60K - $80K | Silver | 36 | 3 | 3 | 2 | 34516.000 | 0 | 34516.000 | 0.774 | 12920 | 132 | 0.737 | 0.000 |
Observation
# Capping values for Total Transaction Count at highest value i.e. 131
# Direct assignment instead of Series.clip(..., inplace=True): inplace on a
# column selection relies on chained assignment and is deprecated in pandas.
data["Total_Trans_Ct"] = data["Total_Trans_Ct"].clip(upper=131)
histogram_boxplot(data, "Total_Ct_Chng_Q4_Q1")
Observation
# Capping values for Total_Ct_Chng_Q4_Q1 at highest value i.e. 2
# Direct assignment avoids the deprecated inplace clip on a column selection.
data["Total_Ct_Chng_Q4_Q1"] = data["Total_Ct_Chng_Q4_Q1"].clip(upper=2)
histogram_boxplot(data, "Total_Amt_Chng_Q4_Q1")
Observation
# Capping values for Total_Amt_Chng_Q4_Q1 at highest value i.e. 2
# Direct assignment avoids the deprecated inplace clip on a column selection.
data["Total_Amt_Chng_Q4_Q1"] = data["Total_Amt_Chng_Q4_Q1"].clip(upper=2)
histogram_boxplot(data, "Avg_Utilization_Ratio")
Observation
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart.

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # Sort rows by the least-frequent target class (last index of value_counts)
    sorter = data[target].value_counts().index[-1]
    # Raw counts, including row/column totals (margins=True)
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # Row-normalized proportions drive the stacked bars
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 6))
    # The original called plt.legend twice; the first call was dead code,
    # immediately overridden by this one, so only the effective call is kept.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
### function to plot distributions wrt target
def distribution_plot_wrt_target(data, predictor, target):
    """Visualize how ``predictor`` is distributed across the two ``target`` classes.

    Draws a 2x2 grid: density histograms (with KDE) of ``predictor`` for the
    first and second target class on the top row, and boxplots of
    ``predictor`` vs ``target`` (with and without outliers) on the bottom row.
    """
    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    target_uniq = data[target].unique()

    # Top row: one density histogram per target class
    for col, shade in ((0, "teal"), (1, "orange")):
        level = target_uniq[col]
        axs[0, col].set_title("Distribution of target for target=" + str(level))
        sns.histplot(
            data=data[data[target] == level],
            x=predictor,
            kde=True,
            ax=axs[0, col],
            color=shade,
            stat="density",
        )

    # Bottom row: boxplots, first with outliers shown, then suppressed
    axs[1, 0].set_title("Boxplot w.r.t target")
    sns.boxplot(data=data, x=target, y=predictor, ax=axs[1, 0], palette="gist_rainbow")
    axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs[1, 1],
        showfliers=False,
        palette="gist_rainbow",
    )
    plt.tight_layout()
    plt.show()
sns.pairplot(data, hue="Attrition_Flag")
<seaborn.axisgrid.PairGrid at 0x1f68a9abca0>
plt.figure(figsize=(15, 7))
sns.heatmap(data.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
Observation
# Dropping high corelated columns
data.drop(
columns=[
"Avg_Open_To_Buy",
"Total_Trans_Ct",
"Months_on_book",
"Avg_Utilization_Ratio",
],
inplace=True,
)
stacked_barplot(data, "Customer_Age", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Customer_Age All 1627 8500 10127 43 85 388 473 48 85 387 472 44 84 416 500 46 82 408 490 45 79 407 486 49 79 416 495 47 76 403 479 41 76 303 379 50 71 381 452 54 69 238 307 40 64 297 361 42 62 364 426 53 59 328 387 52 58 318 376 51 58 340 398 55 51 228 279 39 48 285 333 38 47 256 303 56 43 219 262 59 40 117 157 37 37 223 260 57 33 190 223 58 24 133 157 36 24 197 221 35 21 163 184 33 20 107 127 34 19 127 146 32 17 89 106 61 17 76 93 62 17 76 93 30 15 55 70 31 13 78 91 60 13 114 127 65 9 92 101 63 8 57 65 29 7 49 56 26 6 72 78 64 5 38 43 27 3 29 32 28 1 28 29 66 1 1 2 68 1 1 2 67 0 4 4 70 0 1 1 73 0 1 1 ------------------------------------------------------------------------------------------------------------------------
Observation
stacked_barplot(data, "Gender", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Gender All 1627 8500 10127 F 930 4428 5358 M 697 4072 4769 ------------------------------------------------------------------------------------------------------------------------
Observation
sns.countplot(data=data, x="Education_Level", hue="Attrition_Flag")
<AxesSubplot:xlabel='Education_Level', ylabel='count'>
Observation
sns.countplot(data=data, x="Marital_Status", hue="Attrition_Flag")
<AxesSubplot:xlabel='Marital_Status', ylabel='count'>
Observation
sns.countplot(data=data, x="Income_Category", hue="Attrition_Flag")
<AxesSubplot:xlabel='Income_Category', ylabel='count'>
Observation
sns.countplot(data=data, x="Card_Category", hue="Attrition_Flag")
<AxesSubplot:xlabel='Card_Category', ylabel='count'>
Observation
# Total_Relationship_Count
sns.countplot(data=data, x="Total_Relationship_Count", hue="Attrition_Flag")
<AxesSubplot:xlabel='Total_Relationship_Count', ylabel='count'>
Observation
sns.countplot(data=data, x="Months_Inactive_12_mon", hue="Attrition_Flag")
<AxesSubplot:xlabel='Months_Inactive_12_mon', ylabel='count'>
Observation
distribution_plot_wrt_target(data, "Customer_Age", "Attrition_Flag")
Observation
distribution_plot_wrt_target(data, "Credit_Limit", "Attrition_Flag")
Observation
distribution_plot_wrt_target(data, "Total_Trans_Amt", "Attrition_Flag")
Observation
# Columns to compare against the target with boxplots.
# The original selected these from `data` and read .columns.tolist() back,
# which is an identity roundtrip — the plain list is equivalent.
cols = ["Total_Trans_Amt", "Credit_Limit", "Total_Revolving_Bal", "Total_Ct_Chng_Q4_Q1"]
plt.figure(figsize=(10, 10))
for i, variable in enumerate(cols):
    plt.subplot(3, 2, i + 1)
    # Pass x/y as keywords: seaborn >= 0.12 no longer accepts them positionally
    sns.boxplot(x=data["Attrition_Flag"], y=data[variable])
    plt.tight_layout()
    plt.title(variable)
plt.show()
Observation
There is a perfect correlation (1.0) between Credit_Limit and Avg_Open_To_Buy. There is a high correlation (0.8) between Total_Trans_Amt and Total_Trans_Ct. There is a high correlation (0.79) between Months_on_book and Customer_Age. There is some correlation (0.62) between Avg_Open_To_Buy and Total_Revolving_Bal. Due to these correlations, the following columns are dropped from the dataset: Avg_Open_To_Buy, Total_Trans_Ct, Months_on_book, Avg_Utilization_Ratio.
Customer attrition was highest at ages 68, 66 and 59 (note: the counts at these ages are very small, so these proportions are based on few customers).
There is no noticeable difference between Male and Female vs Attrition, though Female customers are a bit more in number in renouncing Credit Cards
Graduate Class holds the majority of Existing Customers. At the same time this class has the highest Attrition as well. High School educated class holds the next level as Existing Customers Comparatively Attrition is less in Uneducated and Post-Graduate class
Married Customers forms the majority of existing customers Attrition is almost similar among Married and Single Customers Attrition is lowest amongs the Divorced Customers
Majority of existing customers fall under Income Levels of less than 40K Attrition is also highest in this category The next highest category of customers is in 40-60k.
Majority of existing customers have subscribed to the Blue Credit Card The next category of existing customers is under Silver Category Gold and Platinum Cards have very minimal subscription
Majority of Existing Customers as well as Attrited Customers have around 3 banking products Attrition decreases as they hold more banking products (upto 6) Attrition increases from number of products held by customers from 1 to 3.
Most customers have 1-3 months of inactivity (in a 12-month period). Attrition also gets higher around 3 months of inactivity.
Credit Limit for the majority of Customers are around 1500 and under 5000. These segment has many outliers. There are some extreme Credit Limits granted for a few around 35000
The range of Total_Trans_Amt is comparatively lower for Attrited Customers. Credit Limit values are also relatively lower for Attrited Customers (this has outliers). Total_Revolving_Bal for Attrited Customers is lower when compared to existing customers. Total_Ct_Chng_Q4_Q1 values are also lower for Attrited Customers (with outliers).
# Reducing the categories of Education Level by merging certain categories:
# - College -> Graduate, with an assumption that some of the college-educated
#   customers may be working part-time and earning similar to a Graduate
# - Doctorate -> Post-Graduate
data["Education_Level"] = data["Education_Level"].replace(
    {"College": "Graduate", "Doctorate": "Post-Graduate"}
)
We will use KNN imputer to impute missing values.
KNNImputer: Each sample's missing values are imputed by looking at the n_neighbors nearest neighbors found in the training set. The default value is n_neighbors=5.
data.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 1112 Card_Category 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Ct_Chng_Q4_Q1 0 dtype: int64
# KNN imputer: fills each missing value from the 5 nearest neighbors
imputer = KNNImputer(n_neighbors=5)
# defining a list with names of the columns that will be used for imputation
# (the three categorical columns that contain missing values)
reqd_col_for_impute = [
    "Education_Level",
    "Marital_Status",
    "Income_Category",
]
data[reqd_col_for_impute].tail(10)
| Education_Level | Marital_Status | Income_Category | |
|---|---|---|---|
| 10117 | Graduate | Married | $80K - $120K |
| 10118 | NaN | NaN | $80K - $120K |
| 10119 | Uneducated | Single | NaN |
| 10120 | High School | Single | $60K - $80K |
| 10121 | Graduate | Single | Less than $40K |
| 10122 | Graduate | Single | $40K - $60K |
| 10123 | NaN | Divorced | $40K - $60K |
| 10124 | High School | Married | Less than $40K |
| 10125 | Graduate | NaN | $40K - $60K |
| 10126 | Graduate | Married | Less than $40K |
# Copying the dataset to data1 for further processing
data1 = data.copy()
# Printing number of count of each unique value in each column
for column in reqd_col_for_impute:
print(data1[column].value_counts())
print("-" * 40)
Graduate 4141 High School 2013 Uneducated 1487 Post-Graduate 967 Name: Education_Level, dtype: int64 ---------------------------------------- Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 ---------------------------------------- Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 $120K + 727 Name: Income_Category, dtype: int64 ----------------------------------------
# Converting categorical columns to numerical codes so KNN imputation can
# compute distances over them.
# NOTE(review): KNNImputer treats these codes as ordinal distances, but
# Income_Category is not coded in increasing income order ($80K - $120K -> 2
# while $60K - $80K -> 3) — confirm this ordering is intentional.
Education_Level = {"Graduate": 0, "High School": 1, "Uneducated": 2, "Post-Graduate": 3}
data1["Education_Level"] = data1["Education_Level"].map(Education_Level)
Marital_Status = {"Married": 0, "Single": 1, "Divorced": 2}
data1["Marital_Status"] = data1["Marital_Status"].map(Marital_Status)
Income_Category = {
    "Less than $40K": 0,
    "$40K - $60K": 1,
    "$80K - $120K": 2,
    "$60K - $80K": 3,
    "$120K +": 4,
}
data1["Income_Category"] = data1["Income_Category"].map(Income_Category)
data1.tail(10)
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Ct_Chng_Q4_Q1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10117 | Existing Customer | 57 | M | 2 | 0.000 | 0.000 | 2.000 | Blue | 6 | 3 | 4 | 17925.000 | 1909 | 0.712 | 17498 | 0.820 |
| 10118 | Attrited Customer | 50 | M | 1 | NaN | NaN | 2.000 | Blue | 6 | 3 | 4 | 9959.000 | 952 | 0.825 | 10310 | 1.100 |
| 10119 | Attrited Customer | 55 | F | 3 | 2.000 | 1.000 | NaN | Blue | 4 | 3 | 3 | 14657.000 | 2517 | 0.166 | 6009 | 0.514 |
| 10120 | Existing Customer | 54 | M | 1 | 1.000 | 1.000 | 3.000 | Blue | 5 | 2 | 0 | 13940.000 | 2109 | 0.660 | 15577 | 0.754 |
| 10121 | Existing Customer | 56 | F | 1 | 0.000 | 1.000 | 0.000 | Blue | 4 | 1 | 4 | 3688.000 | 606 | 0.570 | 14596 | 0.791 |
| 10122 | Existing Customer | 50 | M | 2 | 0.000 | 1.000 | 1.000 | Blue | 3 | 2 | 3 | 4003.000 | 1851 | 0.703 | 15476 | 0.857 |
| 10123 | Attrited Customer | 41 | M | 2 | NaN | 2.000 | 1.000 | Blue | 4 | 2 | 3 | 4277.000 | 2186 | 0.804 | 8764 | 0.683 |
| 10124 | Attrited Customer | 44 | F | 1 | 1.000 | 0.000 | 0.000 | Blue | 5 | 3 | 4 | 5409.000 | 0 | 0.819 | 10291 | 0.818 |
| 10125 | Attrited Customer | 30 | M | 2 | 0.000 | NaN | 1.000 | Blue | 4 | 3 | 3 | 5281.000 | 0 | 0.535 | 8395 | 0.722 |
| 10126 | Attrited Customer | 43 | F | 2 | 0.000 | 0.000 | 0.000 | Silver | 6 | 2 | 4 | 10388.000 | 1961 | 0.703 | 10294 | 0.649 |
# Splitting the data into the dependent variable (y: Attrition_Flag) and independent variables (X)
X = data1.drop(["Attrition_Flag"], axis=1)
# NOTE(review): the positive class (1) is "Existing Customer", so downstream
# recall/precision are measured on retention rather than attrition — confirm
# this is the intended framing.
y = data1["Attrition_Flag"].apply(lambda x: 1 if x == "Existing Customer" else 0)
# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary and test (80/20, stratified on y)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation (25% of the 80%,
# giving a 60/20/20 overall split)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 15) (2026, 15) (2026, 15)
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in validation data =", X_val.shape[0])
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 6075 Number of rows in validation data = 2026 Number of rows in test data = 2026
# Fit the imputer on the train data only (so validation/test information
# cannot leak into the imputation), then transform the train split
X_train[reqd_col_for_impute] = imputer.fit_transform(X_train[reqd_col_for_impute])
# Transform the validation data with the train-fitted imputer
X_val[reqd_col_for_impute] = imputer.transform(X_val[reqd_col_for_impute])
# Transform the test data with the train-fitted imputer
X_test[reqd_col_for_impute] = imputer.transform(X_test[reqd_col_for_impute])
# Checking that no column has missing values in train, validation or test sets
print(X_train.isna().sum())
print("-" * 30)
print(X_val.isna().sum())
print("-" * 30)
print(X_test.isna().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Ct_Chng_Q4_Q1 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Ct_Chng_Q4_Q1 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Ct_Chng_Q4_Q1 0 dtype: int64
Observation
## Function to inverse the encoding
def inverse_mapping(x, y):
    """Map the ordinal codes in column *y* of X_train/X_val/X_test back to
    their original category labels, using the forward mapping *x* (label -> code).
    """
    inv_dict = {}
    for label, code in x.items():  # invert the label -> code mapping
        inv_dict[code] = label
    # Round first: imputation may have produced non-integer codes
    for frame in (X_train, X_val, X_test):
        frame[y] = np.round(frame[y]).map(inv_dict).astype("category")


inverse_mapping(Education_Level, "Education_Level")
inverse_mapping(Marital_Status, "Marital_Status")
inverse_mapping(Income_Category, "Income_Category")
# Verifying for training data set: show the level counts of every
# object/category column after the inverse mapping
cols = X_train.select_dtypes(include=["object", "category"])
for col in cols.columns:
    print(X_train[col].value_counts())
    print("*" * 30)
F 3205 M 2870 Name: Gender, dtype: int64 ****************************** Graduate 2606 High School 1847 Uneducated 1048 Post-Graduate 574 Name: Education_Level, dtype: int64 ****************************** Married 3036 Single 2590 Divorced 449 Name: Marital_Status, dtype: int64 ****************************** Less than $40K 2249 $40K - $60K 1411 $80K - $120K 1132 $60K - $80K 847 $120K + 436 Name: Income_Category, dtype: int64 ****************************** Blue 5668 Silver 327 Gold 71 Platinum 9 Name: Card_Category, dtype: int64 ******************************
# One-hot encode every categorical column; drop_first avoids the dummy trap
X_train = pd.get_dummies(X_train, drop_first=True)
X_val = pd.get_dummies(X_val, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)
# Fix: encoding each split independently can yield mismatched dummy columns
# when a category is absent from one split. Align validation/test to the
# train columns, filling any missing dummy with 0 and dropping extras.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 23) (2026, 23) (2026, 23)
X_train.head()
| Customer_Age | Dependent_count | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Ct_Chng_Q4_Q1 | Gender_M | Education_Level_High School | Education_Level_Post-Graduate | Education_Level_Uneducated | Marital_Status_Married | Marital_Status_Single | Income_Category_$40K - $60K | Income_Category_$60K - $80K | Income_Category_$80K - $120K | Income_Category_Less than $40K | Card_Category_Gold | Card_Category_Platinum | Card_Category_Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9501 | 47 | 2 | 1 | 2 | 2 | 21714.000 | 1969 | 0.944 | 13270 | 0.625 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 5065 | 49 | 4 | 5 | 1 | 4 | 7789.000 | 957 | 0.724 | 3412 | 0.842 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2375 | 53 | 2 | 6 | 1 | 3 | 3176.000 | 1470 | 0.388 | 1634 | 0.472 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 7579 | 56 | 2 | 3 | 3 | 1 | 3296.000 | 1435 | 0.968 | 4327 | 0.737 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2776 | 47 | 4 | 3 | 3 | 3 | 17557.000 | 0 | 0.667 | 2142 | 0.378 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
The model can make wrong predictions.
The important case: losing a customer is the costlier error, so identifying likely churners lets the bank take the necessary actions to retain those credit card customers.
How to reduce the loss? The bank would want Recall to be maximized, so that there is less chance of false negatives (churners the model misses).
# Candidate classifiers, each paired with a display name for reporting
models = [
    ("Bagging", BaggingClassifier(random_state=1)),
    ("Random forest", RandomForestClassifier(random_state=1)),
    ("GBM", GradientBoostingClassifier(random_state=1)),
    ("Adaboost", AdaBoostClassifier(random_state=1)),
    ("Logistic Regression", LogisticRegression(random_state=1)),
    ("dtree", DecisionTreeClassifier(random_state=1)),
]
results = []  # one array of CV scores per model
names = []  # model display names, aligned with results
# Cross-validated recall for every candidate model
print("\nCross-Validation Performance:\n")
for name, model in models:
    # Stratified 5-fold CV preserves the class ratio in every fold
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring="recall", cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print(f"{name}: {cv_result.mean() * 100}")
# Recall on the full training split (to gauge overfitting)
print("\nTraining Performance:\n")
for name, model in models:
    model.fit(X_train, y_train)
    print(f"{name}: {recall_score(y_train, model.predict(X_train)) * 100}")
Cross-Validation Performance: Bagging: 96.8820450653274 Random forest: 98.82339471608074 GBM: 98.56839654409359 Adaboost: 97.09742346398814 Logistic Regression: 97.90155669726182 dtree: 95.31293655833284 Training Performance: Bagging: 99.78427142576976 Random forest: 100.0 GBM: 98.90174544028241 Adaboost: 97.4112571092371 Logistic Regression: 97.84271425769758 dtree: 100.0
# Compare the CV score spread of every candidate model side by side
fig, ax = plt.subplots(figsize=(10, 7))
fig.suptitle("Algorithm Comparison")
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
Observation
# Class balance before SMOTE
print(f"Before Oversampling, counts of label 'Yes': {sum(y_train == 1)}")
print(f"Before Oversampling, counts of label 'No': {sum(y_train == 0)} \n")
# Synthetic Minority Over Sampling Technique: synthesize minority-class rows
# until both classes are the same size (sampling_strategy=1)
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print(f"After Oversampling, counts of label 'Yes': {sum(y_train_over == 1)}")
print(f"After Oversampling, counts of label 'No': {sum(y_train_over == 0)} \n")
print(f"After Oversampling, the shape of train_X: {X_train_over.shape}")
print(f"After Oversampling, the shape of train_y: {y_train_over.shape} \n")
Before Oversampling, counts of label 'Yes': 5099 Before Oversampling, counts of label 'No': 976 After Oversampling, counts of label 'Yes': 5099 After Oversampling, counts of label 'No': 5099 After Oversampling, the shape of train_X: (10198, 23) After Oversampling, the shape of train_y: (10198,)
# Same candidate classifiers, re-evaluated on the SMOTE-oversampled train set
models = [
    ("Bagging", BaggingClassifier(random_state=1)),
    ("Random forest", RandomForestClassifier(random_state=1)),
    ("GBM", GradientBoostingClassifier(random_state=1)),
    ("Adaboost", AdaBoostClassifier(random_state=1)),
    ("Logistic Regression", LogisticRegression(random_state=1)),
    ("dtree", DecisionTreeClassifier(random_state=1)),
]
results = []  # one array of CV scores per model
names = []  # model display names, aligned with results
# Cross-validated recall on the oversampled data
print("\nCross-Validation Performance:\n")
for name, model in models:
    # Stratified 5-fold CV preserves the class ratio in every fold
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    cv_result = cross_val_score(
        estimator=model, X=X_train_over, y=y_train_over, scoring="recall", cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print(f"{name}: {cv_result.mean() * 100}")
# Recall on the oversampled training data itself
print("\nTraining Performance:\n")
for name, model in models:
    model.fit(X_train_over, y_train_over)
    print(f"{name}: {recall_score(y_train_over, model.predict(X_train_over)) * 100}")
Cross-Validation Performance: Bagging: 93.41045623352382 Random forest: 96.5875810579384 GBM: 96.17579710981546 Adaboost: 94.07727683811503 Logistic Regression: 79.13280994438992 dtree: 92.99861455867922 Training Performance: Bagging: 99.47048440870759 Random forest: 100.0 GBM: 96.84251814081193 Adaboost: 94.68523239850951 Logistic Regression: 78.28986075701117 dtree: 100.0
# CV score spread of each model on the oversampled training data
fig, ax = plt.subplots(figsize=(10, 7))
fig.suptitle("Algorithm Comparison Over Sample")
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
Observation
# Random undersampling: drop majority-class rows until both classes match
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print(f"Before Undersampling, counts of label 'Yes': {sum(y_train == 1)}")
print(f"Before Undersampling, counts of label 'No': {sum(y_train == 0)} \n")
print(f"After Undersampling, counts of label 'Yes': {sum(y_train_un == 1)}")
print(f"After Undersampling, counts of label 'No': {sum(y_train_un == 0)} \n")
print(f"After Undersampling, the shape of train_X: {X_train_un.shape}")
print(f"After Undersampling, the shape of train_y: {y_train_un.shape} \n")
Before Undersampling, counts of label 'Yes': 5099 Before Undersampling, counts of label 'No': 976 After Undersampling, counts of label 'Yes': 976 After Undersampling, counts of label 'No': 976 After Undersampling, the shape of train_X: (1952, 23) After Undersampling, the shape of train_y: (1952,)
# Same candidate classifiers, re-evaluated on the undersampled train set
models = [
    ("Bagging", BaggingClassifier(random_state=1)),
    ("Random forest", RandomForestClassifier(random_state=1)),
    ("GBM", GradientBoostingClassifier(random_state=1)),
    ("Adaboost", AdaBoostClassifier(random_state=1)),
    ("Logistic Regression", LogisticRegression(random_state=1)),
    ("dtree", DecisionTreeClassifier(random_state=1)),
]
results = []  # one array of CV scores per model
names = []  # model display names, aligned with results
# Cross-validated recall on the undersampled data
print("\nCross-Validation Performance:\n")
for name, model in models:
    # Stratified 5-fold CV preserves the class ratio in every fold
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    cv_result = cross_val_score(
        estimator=model, X=X_train_un, y=y_train_un, scoring="recall", cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print(f"{name}: {cv_result.mean() * 100}")
# Recall on the undersampled training data itself
print("\nTraining Performance:\n")
for name, model in models:
    model.fit(X_train_un, y_train_un)
    print(f"{name}: {recall_score(y_train_un, model.predict(X_train_un)) * 100}")
Cross-Validation Performance: Bagging: 88.72684458398744 Random forest: 91.39194139194139 GBM: 92.62428048142333 Adaboost: 89.24228152799583 Logistic Regression: 73.8770277341706 dtree: 89.23652537938253 Training Performance: Bagging: 99.18032786885246 Random forest: 100.0 GBM: 96.31147540983606 Adaboost: 91.39344262295081 Logistic Regression: 75.30737704918032 dtree: 100.0
# CV score spread of each model on the undersampled training data
fig, ax = plt.subplots(figsize=(10, 7))
fig.suptitle("Algorithm Comparison Under Sample")
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
Observation
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier
    predictors: independent variables
    target: dependent variable

    Returns a one-row DataFrame with Accuracy, Recall, Precision and F1.
    """
    pred = model.predict(predictors)  # predictions from the fitted classifier
    # Assemble all four metrics into a single one-row frame
    return pd.DataFrame(
        {
            "Accuracy": accuracy_score(target, pred),
            "Recall": recall_score(target, pred),
            "Precision": precision_score(target, pred),
            "F1": f1_score(target, pred),
        },
        index=[0],
    )
# lets calculate the values of metrics for each of these Algorithms
# RandomForest: baseline model with default hyperparameters
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
RandomForestClassifier(random_state=1)
scoring = "recall"  # churn detection: false negatives are the costly error
# Stratified 5-fold CV preserves the class ratio in every fold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=rf, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Spread of the recall score across the five folds
plt.boxplot(cv_result_over)
plt.show()
# Baseline random forest performance on the training split
rf_train_perf = model_performance_classification_sklearn(
    model=rf, predictors=X_train, target=y_train
)
print("Training performance:")
rf_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Score the baseline random forest on the validation split
rf_val_perf = model_performance_classification_sklearn(
    model=rf, predictors=X_val, target=y_val
)
print("Validation performance:")
rf_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.938 | 0.988 | 0.941 | 0.964 |
Observation
%%time
# Randomized hyperparameter search for the random forest:
# 30 sampled combinations, 5-fold CV, recall as the selection metric.
# Choose the type of classifier.
rf2 = RandomForestClassifier(random_state=1)
# Grid of parameters to choose from
# NOTE(review): np.arange(3, 4, 5) evaluates to array([3]) — only depth 3 is
# ever tried; a wider range (e.g. np.arange(3, 5) or [3, 4, 5]) was probably
# intended. Confirm before rerunning the search.
parameters = {"n_estimators": [150,200,250],
"min_samples_leaf": np.arange(5, 10),
"max_features": np.arange(0.2, 0.7, 0.1),
"max_samples": np.arange(0.3, 0.7, 0.1),
"max_depth":np.arange(3,4,5),
"class_weight" : ['balanced', 'balanced_subsample'],
"min_impurity_decrease":[0.001, 0.002, 0.003]
}
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the random search
grid_obj = RandomizedSearchCV(rf2, parameters,n_iter=30, scoring=acc_scorer,cv=5, random_state = 1, n_jobs = -1, verbose = 2)
# using n_iter = 30, so randomized search will try 30 different combinations of hyperparameters
# by default, n_iter = 10
grid_obj = grid_obj.fit(X_train, y_train)
# Print the best combination of parameters
grid_obj.best_params_
Fitting 5 folds for each of 30 candidates, totalling 150 fits Wall time: 34.6 s
{'n_estimators': 150,
'min_samples_leaf': 5,
'min_impurity_decrease': 0.001,
'max_samples': 0.3,
'max_features': 0.2,
'max_depth': 3,
'class_weight': 'balanced_subsample'}
# Rebuild the random forest with the best parameters found by the random search
rf2_tuned = RandomForestClassifier(
    n_estimators=150,
    max_depth=3,
    min_samples_leaf=5,
    min_impurity_decrease=0.001,
    max_features=0.2,
    max_samples=0.3,
    class_weight="balanced_subsample",
    random_state=1,
)
# Fit the tuned model on the training data
rf2_tuned.fit(X_train, y_train)
RandomForestClassifier(class_weight='balanced_subsample', max_depth=3,
max_features=0.2, max_samples=0.3,
min_impurity_decrease=0.001, min_samples_leaf=5,
n_estimators=150, random_state=1)
# Score the tuned random forest on the training split
rf2_tuned_random_train = model_performance_classification_sklearn(
    model=rf2_tuned, predictors=X_train, target=y_train
)
print("Training performance:")
rf2_tuned_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.871 | 0.879 | 0.963 | 0.920 |
# Score the tuned random forest on the validation split
rf2_tuned_random_val = model_performance_classification_sklearn(
    model=rf2_tuned, predictors=X_val, target=y_val
)
print("Validation performance:")
rf2_tuned_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.874 | 0.892 | 0.955 | 0.922 |
Observation
# lets calculate the values of metrics for each of these Algorithms
# Gradient Boost: baseline model with default hyperparameters
gb = GradientBoostingClassifier(random_state=1)
gb.fit(X_train, y_train)
GradientBoostingClassifier(random_state=1)
scoring = "recall"  # prioritize catching attriting customers
# Stratified 5-fold CV preserves the class ratio in every fold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=gb, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Spread of the recall score across the five folds
plt.boxplot(cv_result_over)
plt.show()
# Baseline gradient boosting performance on the training split
gb_train_perf = model_performance_classification_sklearn(
    model=gb, predictors=X_train, target=y_train
)
print("Training performance:")
gb_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.964 | 0.989 | 0.969 | 0.979 |
# Score the baseline gradient boosting model on the validation split
gb_val_perf = model_performance_classification_sklearn(
    model=gb, predictors=X_val, target=y_val
)
print("Validation performance:")
gb_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.952 | 0.988 | 0.956 | 0.972 |
Observation
%%time
# Randomized hyperparameter search for gradient boosting, scored on recall.
# Choose the type of classifier.
gb2 = GradientBoostingClassifier(random_state=1)
# Grid of parameters to choose from
# NOTE(review): np.arange(3, 4, 5) evaluates to array([3]) — only depth 3 is
# tried. Also, this space holds just 3*1*4 = 12 distinct combinations, so
# n_iter=30 degenerates to an exhaustive search of 12 candidates (matching
# the "12 candidates" in the output). Confirm the intended depth range.
parameters = {"n_estimators": [150,200,250],
"max_depth":np.arange(3,4,5),
'learning_rate':[0.01,0.1,0.2,0.05]
}
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the random search
grid_obj = RandomizedSearchCV(gb2, parameters,n_iter=30, scoring=acc_scorer,cv=5, random_state = 1, n_jobs = -1, verbose = 2)
# using n_iter = 30, so randomized search will try 30 different combinations of hyperparameters
# by default, n_iter = 10
grid_obj = grid_obj.fit(X_train, y_train)
# Print the best combination of parameters
grid_obj.best_params_
Fitting 5 folds for each of 12 candidates, totalling 60 fits Wall time: 33.6 s
{'n_estimators': 150, 'max_depth': 3, 'learning_rate': 0.01}
# Rebuild the gradient boosting model with the best parameters from the search
gb2_tuned = GradientBoostingClassifier(
    n_estimators=150, learning_rate=0.01, max_depth=3, random_state=1
)
# Fit the tuned model on the training data
gb2_tuned.fit(X_train, y_train)
GradientBoostingClassifier(learning_rate=0.01, n_estimators=150, random_state=1)
# Score the tuned gradient boosting model on the training split
gb2_tuned_random_train = model_performance_classification_sklearn(
    model=gb2_tuned, predictors=X_train, target=y_train
)
print("Training performance:")
gb2_tuned_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.909 | 0.991 | 0.909 | 0.948 |
# Score the tuned gradient boosting model on the validation split
gb2_tuned_random_val = model_performance_classification_sklearn(
    model=gb2_tuned, predictors=X_val, target=y_val
)
print("Validation performance:")
gb2_tuned_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.904 | 0.991 | 0.904 | 0.946 |
Observation
# lets calculate the values of metrics for each of these Algorithms
# Logistic Regression: baseline linear model with default settings
lr = LogisticRegression(random_state=1)
lr.fit(X_train, y_train)
LogisticRegression(random_state=1)
scoring = "recall"  # prioritize catching attriting customers
# Stratified 5-fold CV preserves the class ratio in every fold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=lr, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Spread of the recall score across the five folds
plt.boxplot(cv_result_over)
plt.show()
# Baseline logistic regression performance on the training split
lr_train_perf = model_performance_classification_sklearn(
    model=lr, predictors=X_train, target=y_train
)
print("Training performance:")
lr_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.878 | 0.978 | 0.888 | 0.931 |
# Score the baseline logistic regression on the validation split
lr_val_perf = model_performance_classification_sklearn(
    model=lr, predictors=X_val, target=y_val
)
print("Validation performance:")
lr_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.866 | 0.979 | 0.875 | 0.924 |
Observation
%%time
# Randomized search over solver/penalty for logistic regression, scored on
# recall with repeated stratified 10-fold CV (3 repeats).
# Choose the type of classifier.
lr2 = LogisticRegression(random_state=1)
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
# NOTE(review): several combinations in this space are invalid in
# scikit-learn (e.g. liblinear + 'none', newton-cg/lbfgs + 'l1',
# 'elasticnet' with any of these solvers); those fits fail and are scored
# as errors by the search. Confirm the intended space.
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
#space['C'] = loguniform(1e-5, 100)
# Type of scoring used to compare parameter combinations
acc_scorer = metrics.make_scorer(metrics.recall_score)
# Run the random search
grid_obj = RandomizedSearchCV(lr2, space, n_iter=500, scoring=acc_scorer, n_jobs=-1, cv=cv, random_state=1)
# n_iter = 500 here, although only 3*4 = 12 distinct combinations exist,
# so every combination is tried (by default, n_iter = 10)
grid_obj = grid_obj.fit(X_train, y_train)
# Print the best combination of parameters
grid_obj.best_params_
Wall time: 16.5 s
{'solver': 'liblinear', 'penalty': 'l2'}
# Rebuild the logistic regression with the best parameters from the search
lr2_tuned = LogisticRegression(penalty="l2", solver="liblinear")
# Fit the tuned model on the training data
lr2_tuned.fit(X_train, y_train)
LogisticRegression(solver='liblinear')
# Score the tuned logistic regression on the training split
lr2_tuned_random_train = model_performance_classification_sklearn(
    model=lr2_tuned, predictors=X_train, target=y_train
)
print("Training performance:")
lr2_tuned_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.880 | 0.979 | 0.889 | 0.932 |
# Score the tuned logistic regression on the validation split
lr2_tuned_random_val = model_performance_classification_sklearn(
    model=lr2_tuned, predictors=X_val, target=y_val
)
print("Validation performance:")
lr2_tuned_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.870 | 0.983 | 0.877 | 0.927 |
Observation
# training performance comparison: one column per model, metrics as rows
comparison_labels = [
    "RandomForest",
    "RandomForest Tuned with Random Search",
    "Gradient Boost",
    "Gradient Boost Tuned with Random Search",
    "Logistic Regression",
    "Logistic Regression Tuned with Random Search",
]
models_train_comp_df = pd.concat(
    [
        rf_train_perf.T,
        rf2_tuned_random_train.T,
        gb_train_perf.T,
        gb2_tuned_random_train.T,
        lr_train_perf.T,
        lr2_tuned_random_train.T,
    ],
    axis=1,
)
models_train_comp_df.columns = comparison_labels
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| RandomForest | RandomForest Tuned with Random Search | Gradient Boost | Gradient Boost Tuned with Random Search | Logistic Regression | Logistic Regression Tuned with Random Search | |
|---|---|---|---|---|---|---|
| Accuracy | 1.000 | 0.871 | 0.964 | 0.909 | 0.878 | 0.880 |
| Recall | 1.000 | 0.879 | 0.989 | 0.991 | 0.978 | 0.979 |
| Precision | 1.000 | 0.963 | 0.969 | 0.909 | 0.888 | 0.889 |
| F1 | 1.000 | 0.920 | 0.979 | 0.948 | 0.931 | 0.932 |
# validation performance comparison: one column per model, metrics as rows
comparison_labels = [
    "RandomForest",
    "RandomForest Tuned with Random Search",
    "Gradient Boost",
    "Gradient Boost Tuned with Random Search",
    "Logistic Regression",
    "Logistic Regression Tuned with Random Search",
]
models_val_comp_df = pd.concat(
    [
        rf_val_perf.T,
        rf2_tuned_random_val.T,
        gb_val_perf.T,
        gb2_tuned_random_val.T,
        lr_val_perf.T,
        lr2_tuned_random_val.T,
    ],
    axis=1,
)
models_val_comp_df.columns = comparison_labels
print("Validation performance comparison:")
models_val_comp_df
Validation performance comparison:
| RandomForest | RandomForest Tuned with Random Search | Gradient Boost | Gradient Boost Tuned with Random Search | Logistic Regression | Logistic Regression Tuned with Random Search | |
|---|---|---|---|---|---|---|
| Accuracy | 0.938 | 0.874 | 0.952 | 0.904 | 0.866 | 0.870 |
| Recall | 0.988 | 0.892 | 0.988 | 0.991 | 0.979 | 0.983 |
| Precision | 0.941 | 0.955 | 0.956 | 0.904 | 0.875 | 0.877 |
| F1 | 0.964 | 0.922 | 0.972 | 0.946 | 0.924 | 0.927 |
Observation
# Final check: score the chosen gradient boosting model on the held-out test set
gb2_tuned_random_test = model_performance_classification_sklearn(
    model=gb2_tuned, predictors=X_test, target=y_test
)
print("Test performance:")
gb2_tuned_random_test
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.900 | 0.995 | 0.898 | 0.944 |
Observation
# Rank features by their contribution to the tuned gradient boosting model
feature_names = X_train.columns
importances = gb2_tuned.feature_importances_
indices = np.argsort(importances)  # ascending, so the top bar is the largest
plt.figure(figsize=(15, 15))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), feature_names[indices])
plt.xlabel("Relative Importance")
plt.show()
Observation
We will create 2 different pipelines, one for numerical columns and one for categorical columns. For numerical columns, we will do missing value imputation as pre-processing. For categorical columns, we will do missing value imputation and one hot encoding as pre-processing.
We are doing missing value imputation for the whole data, so that if there is any missing value in the data in future that can be taken care of.
# creating a list of numerical variables
# NOTE(review): "Education_Level", "Marital_Status" and "Income_Category"
# also appear in categorical_features below, so the ColumnTransformer will
# process those columns twice (a median-imputed copy AND a one-hot-encoded
# copy). Confirm whether they hold ordinal codes in data1 and remove them
# from one of the two lists.
numerical_features = [
"Customer_Age",
"Dependent_count",
"Education_Level",
"Marital_Status",
"Income_Category",
"Total_Relationship_Count",
"Months_Inactive_12_mon",
"Contacts_Count_12_mon",
"Credit_Limit",
"Total_Revolving_Bal",
"Total_Amt_Chng_Q4_Q1",
"Total_Trans_Amt",
"Total_Ct_Chng_Q4_Q1",
]
# creating a transformer for numerical variables, which will apply simple imputer on the numerical variables
# (median imputation is robust to the skewed monetary columns)
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
# creating a list of categorical variables
categorical_features = [
"Gender",
"Education_Level",
"Marital_Status",
"Income_Category",
"Card_Category",
]
# creating a transformer for categorical variables, which will first apply simple imputer and
# then do one hot encoding for categorical variables
# (most_frequent imputation keeps the column categorical before encoding)
categorical_transformer = Pipeline(
steps=[
("imputer", SimpleImputer(strategy="most_frequent")),
("onehot", OneHotEncoder(handle_unknown="ignore")),
]
)
# handle_unknown = "ignore", allows model to handle any unknown category in the test data
# combining categorical transformer and numerical transformer using a column transformer
# NOTE(review): numerical_features and categorical_features share three
# columns (Education_Level, Marital_Status, Income_Category); ColumnTransformer
# applies BOTH branches to them, duplicating those features in the output.
# Verify this is intended.
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numerical_features),
("cat", categorical_transformer, categorical_features),
],
remainder="passthrough",
)
# remainder = "passthrough" has been used, it will allow variables that are present in original data
# but not in "numerical_columns" and "categorical_columns" to pass through the column transformer without any change
# Separating target variable and other variables
Y = data1["Attrition_Flag"]
X = data1.drop(columns="Attrition_Flag")
# Final 70/30 train/test split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, stratify=Y, random_state=1
)
print(X_train.shape, X_test.shape)
(7088, 15) (3039, 15)
# Final pipeline: preprocessing first, then the tuned gradient boosting model
model = Pipeline(
    steps=[
        ("pre", preprocessor),
        (
            "GB",
            GradientBoostingClassifier(
                n_estimators=150, learning_rate=0.01, max_depth=3, random_state=1
            ),
        ),
    ]
)
# Fitting the whole pipeline (imputation + encoding + model) on raw training data
model.fit(X_train, y_train)
Pipeline(steps=[('pre',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median'))]),
['Customer_Age',
'Dependent_count',
'Education_Level',
'Marital_Status',
'Income_Category',
'Total_Relationship_Count',
'Months_Inactive_12_mon',
'Contacts_Count_12_mon',
'Credit_Limit',
'Total_Revolving_Bal',
'...ng_Q4_Q1',
'Total_Trans_Amt',
'Total_Ct_Chng_Q4_Q1']),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
['Gender', 'Education_Level',
'Marital_Status',
'Income_Category',
'Card_Category'])])),
('GB',
GradientBoostingClassifier(learning_rate=0.01,
n_estimators=150,
random_state=1))])
# transforming and predicting on test data
# (the pipeline applies imputation and encoding before the classifier runs)
model.predict(X_test)
array(['Existing Customer', 'Existing Customer', 'Existing Customer', ...,
'Attrited Customer', 'Existing Customer', 'Existing Customer'],
dtype=object)